#####################methods for cross validated approach#####################################
#####################functions to automate the use of the cv.glmnet function################
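## Assumed dependencies (not declared in this file): glmnet, caret, pROC (roc),
## MLmetrics, yardstick, plyr (create_progress_bar), dplyr, stringr, lubridate (now),
## plus in-house helpers assumed to come from highlandr (cton, rown_to_var, formula_maker,
## percent, rename.variables, update_timer, format_time_difference).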
####calculating model accuracy#####
#' en_kfold_accuracy function
#'
#' This function calculates the cross validated accuracy of an elastic net regression (ENR) model
#' @param ddata data frame containing the response variable and all predictors to be modeled
#' @param response_var name of the binary (0/1) response variable. must be present in the data frame
#' @param mod_alpha the elastic net mixing parameter (alpha) to use when fitting the model
#' @param mod_lambda optional fixed lambda value. if NULL (the default) lambda is tuned internally via cv.glmnet, otherwise glmnet is fit at the supplied lambda
#' @param iter number of times the k fold cross validation is repeated. default is 100
#' @param k number of folds per iteration. default is 10
#' @param seed random seed used when shuffling the data into folds. default is 123
#' @param loo boolean indicating whether leave one out cross validation should be used (TRUE) instead of repeated k fold (FALSE). default is FALSE
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence (TRUE) or left unweighted (FALSE). default is FALSE
#' @param type_meas loss measure passed to cv.glmnet's type.measure argument. default is "deviance"
#' @param lr_cutoff predicted probability cutoff used to classify cases as 1 vs 0. default is .5
#' @keywords en_kfold_accuracy enr cross validation accuracy
#' @export
#' @examples
#' \dontrun{en_kfold_accuracy(ddata = dat, response_var = "y", mod_alpha = .5)}
#'
en_kfold_accuracy<-function(ddata,response_var,mod_alpha,mod_lambda=NULL,iter=100,k=10,seed=123,loo=FALSE,eq_wt=FALSE,type_meas = "deviance",lr_cutoff=c(.5)){
print("cutoff")
print(lr_cutoff)
kfoldcv_results_data<-data.frame()
set.seed(seed)
itern<-1
ac_model_form <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
loo_tracker<-vector()
if(loo==TRUE){
k<-nrow(ddata)
iter<-1
}
pbar <- create_progress_bar('text')
pbar$init(iter*k)
for(i in 1:iter){
if(loo==FALSE){
ddata<-ddata[order(runif(nrow(ddata))),]
}
ddata$grp<-rep(1:k,length.out=nrow(ddata))
for(j in 1:k)
{
itername<-paste("cv_",i,"_reffold_",j,sep = "")
if(loo==TRUE){
train<-ddata[-j,-ncol(ddata)]
test<-ddata[j,-ncol(ddata)]
} else {
train<-ddata[ddata$grp!=j,-ncol(ddata)]
test<-ddata[ddata$grp==j,-ncol(ddata)]
}
wt<-NULL
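# when eq_wt=TRUE, weight each case by 1 minus its class's prevalence so the
# rarer class carries proportionally more weight in the fit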
if(eq_wt==TRUE){
wt<-ifelse(train[,response_var]==0,
1-(sum(train[,response_var]==0)/nrow(train)),
1-(sum(train[,response_var]==1)/nrow(train)))
}
train_pred <- model.matrix(ac_model_form,train)[,-1]
train_resp <- train[,response_var]
test_pred <- model.matrix(ac_model_form,test)[,-1]
if(loo==TRUE){
test_pred<-t(test_pred)
}
test_resp <- test[,response_var]
# Fitting
if(is.null(mod_lambda)){
model <- cv.glmnet(x = train_pred,
y = train_resp,
weights = wt,
type.measure=type_meas,
alpha=mod_alpha,
family="binomial")
} else {
# type.measure applies only to cv.glmnet, so it is not passed to glmnet here
model <- glmnet(x = train_pred,
y = train_resp,
weights = wt,
alpha=mod_alpha,
lambda = mod_lambda,
family="binomial")
}
# print("alpha")
# print(mod_alpha)
# print("cutoff")
# print(lr_cutoff)
# print("test response")
# print(length(test_resp))
# print("test predict")
# print(nrow(test_pred))
# Predict results at the fitted lambda (lambda.1se from cv.glmnet, or the supplied mod_lambda)
pred_s <- if(is.null(mod_lambda)) model$lambda.1se else mod_lambda
results_pred_prob <- predict(model,newx=test_pred,type="response",s=pred_s)
results_pred <- ifelse(cton(results_pred_prob)>lr_cutoff,1,0)
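# factor levels are ordered c("1","0") so caret treats 1 as the positive class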
results_pred<-factor(results_pred,levels = c("1","0"))
test_resp<-factor(test_resp,levels = c("1","0"))
#print(cbind(results_pred,test_resp,(results_pred_prob>lr_cutoff),lr_cutoff))
# Confusion matrix
cm <- caret::confusionMatrix(data = results_pred,
reference = test_resp,
positive = "1")
#AUC
mod_auc<-suppressMessages(roc(test_resp,cton(results_pred_prob))$auc)
# Collecting results
cm_results<-t(data.frame(c("iter_fold"=itername,cm$overall,cm$byClass,"AUC"=mod_auc)))
#rbind the results together
kfoldcv_results_data<-rbind(kfoldcv_results_data,cm_results)
itern<-itern+1
pbar$step()
}
}
# Average accuracy of the model
rownames(kfoldcv_results_data)<-1:nrow(kfoldcv_results_data)
kfoldcv_results<-colMeans(apply(kfoldcv_results_data[,2:ncol(kfoldcv_results_data)],2,cton),na.rm = TRUE)
kfold_results<-list("kfold_agg_results"=kfoldcv_results,"kfold_results"=kfoldcv_results_data)
return(kfold_results)
}
#########evaluate tied model fits#########
#' en_kfold_accuracy_tied function
#'
#' This function breaks ties when cross validated performance is tied across candidate ENR models
#' @param tied_alphas vector of alpha values that tied for best performance
#' @param tied_cutoffs vector of classification cutoffs corresponding (positionally) to tied_alphas
#' @param ddata data frame containing the response variable and all predictors
#' @param response_var name of the binary (0/1) response variable
#' @param iter number of times the k fold cross validation is repeated. default is 10
#' @param k number of folds per iteration. default is 10
#' @param seed random seed. default is 123
#' @param favor_parsimony boolean indicating whether remaining ties should be broken in favor of the later listed (typically larger, more parsimonious) alpha. default is TRUE
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param type_meas loss measure passed to cv.glmnet's type.measure argument. default is "deviance"
#' @keywords en_kfold_accuracy_tied enr tied
#' @export
#' @examples
#' \dontrun{en_kfold_accuracy_tied(tied_alphas = c(.3,.5), tied_cutoffs = c(.5,.5), ddata = dat, response_var = "y")}
#'
en_kfold_accuracy_tied<-function(tied_alphas,tied_cutoffs,ddata,response_var,iter=10,k=10,favor_parsimony=TRUE,eq_wt=FALSE,seed=123,type_meas="deviance"){
set.seed(seed)
bestmods_accuracy_stats<-data.frame()
#print(ddata)
#print(response_var)
for (j in 1:length(tied_alphas)){
# print("tc")
# print(tied_cutoffs)
# print("tcj")
# print(tied_cutoffs[j])
# print("tcjj")
# print(tied_cutoffs[[j]])
bestmods_accuracy_stats_results<-en_kfold_accuracy(ddata = ddata,
response_var = response_var,
mod_alpha = tied_alphas[j],
lr_cutoff= tied_cutoffs[j],
iter = iter,
k = k,
eq_wt = eq_wt,
type_meas=type_meas)
#print(j)
#print(bestmods_accuracy_stats_results)
if(j==1){
bestmods_accuracy_stats<-rown_to_var(data.frame("Value"=bestmods_accuracy_stats_results$kfold_agg_results),varname = "statistic")
} else {
bestmods_accuracy_stats<-cbind(bestmods_accuracy_stats,data.frame(bestmods_accuracy_stats_results$kfold_agg_results))
}
}
names(bestmods_accuracy_stats)<-c("metric",as.character(tied_alphas))
bestmods_accuracy_stats_lim<-subset.data.frame(bestmods_accuracy_stats,metric!="AccuracyLower"&
metric!="AccuracyUpper"&
metric!="AccuracyNull"&
metric!="AccuracyPValue"&
metric!="McnemarPValue")
if(favor_parsimony==TRUE){
bestmods_accuracy_stats_lim<-bestmods_accuracy_stats_lim[,c(1,rev(2:ncol(bestmods_accuracy_stats_lim)))]
}
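# choice returns the column name (here, the alpha) with the largest value in a row of
# metrics; getmode returns the most common winner across all metric rows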
choice<-function(x){
names(which.max(x))
}
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
bestmods_accuracy_stats_lim$best_alpha<-apply(bestmods_accuracy_stats_lim[,2:ncol(bestmods_accuracy_stats_lim)],
1,
choice)
best_alpha<-getmode(bestmods_accuracy_stats_lim$best_alpha)
best_alpha_value<-cton(getmode(bestmods_accuracy_stats_lim$best_alpha))
return(list("best_alpha_value"=best_alpha_value,
"best_alpha"=best_alpha,
"limited results"=bestmods_accuracy_stats_lim,
"all_results"=bestmods_accuracy_stats))
}
#########evaluate tied model fits#########
#' en_kfold_accuracy_tied_grid function
#'
#' This function breaks ties when cross validated performance is tied across grid searched ENR models
#' @param tied_alphas vector of alpha values that tied for best performance
#' @param tied_lambdas vector of lambda values corresponding (positionally) to tied_alphas
#' @param tied_cutoffs vector of classification cutoffs corresponding (positionally) to tied_alphas
#' @param ddata data frame containing the response variable and all predictors
#' @param response_var name of the binary (0/1) response variable
#' @param iter number of times the k fold cross validation is repeated. default is 100
#' @param k number of folds per iteration. default is 10
#' @param seed random seed. default is 522
#' @param favor_parsimony boolean indicating whether remaining ties should be broken in favor of the later listed (typically larger, more parsimonious) alpha. default is TRUE
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param type_meas retained for interface consistency with the cv.glmnet based functions. default is "deviance"
#' @param selection_type reserved switch for how the winner is chosen; both the modal and first metric based selections are returned in the output. default is "modal"
#' @keywords tied grid enr
#' @export
#' @examples
#' \dontrun{en_kfold_accuracy_tied_grid(tied_alphas = c(.3,.5), tied_lambdas = c(.1,.1), tied_cutoffs = c(.5,.5), ddata = dat, response_var = "y")}
#'
en_kfold_accuracy_tied_grid<-function(tied_alphas,tied_lambdas,tied_cutoffs,ddata,response_var,iter=100,k=10,favor_parsimony=TRUE,eq_wt=FALSE,seed=522,type_meas="deviance",selection_type="modal"){
set.seed(seed)
bestmods_accuracy_stats<-data.frame()
#print(ddata)
#print(response_var)
#print("here1")
for (j in 1:length(tied_alphas)){
#print("here")
# print(tied_alphas[j])
# print(tied_lambdas[j])
# print(tied_cutoffs[j])
# print("tcjj")
# print(tied_cutoffs[[j]])
bestmods_accuracy_stats_results<-en_kfold_accuracy_grid(ddata = ddata,
response_var = response_var,
mod_alpha = tied_alphas[j],
mod_lambda = tied_lambdas[j],
lr_cutoff= tied_cutoffs[j],
iter = iter,
k = k,
eq_wt = eq_wt,
type_meas=type_meas,
#accuracy_modeling=TRUE
)
#print(j)
#print(bestmods_accuracy_stats_results)
if(j==1){
bestmods_accuracy_stats<-rown_to_var(data.frame("Value"=bestmods_accuracy_stats_results$kfold_agg_results),varname = "statistic")
} else {
bestmods_accuracy_stats<-cbind(bestmods_accuracy_stats,data.frame(bestmods_accuracy_stats_results$kfold_agg_results))
}
}
names(bestmods_accuracy_stats)<-c("metric",as.character(tied_alphas))
bestmods_accuracy_stats_lim<-subset.data.frame(bestmods_accuracy_stats,metric!="AccuracyLower"&
metric!="AccuracyUpper"&
metric!="AccuracyNull"&
metric!="AccuracyPValue"&
metric!="McnemarPValue"&
metric!="Prevalence")
print(bestmods_accuracy_stats_lim)
#print("here2")
if(favor_parsimony==TRUE){
bestmods_accuracy_stats_lim<-bestmods_accuracy_stats_lim[,c(1,rev(2:ncol(bestmods_accuracy_stats_lim)))]
}
choice<-function(x,tied_alphas,tied_lambdas,tied_cutoffs){
index<-names(which.max(x))
alpha<-tied_alphas[which.max(x)]
lambda<-tied_lambdas[which.max(x)]
cutoff<-tied_cutoffs[which.max(x)]
return(c("index"=index,"alpha"=alpha,"lambda"=lambda,"cutoff"=cutoff))
}
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
last_col<-ncol(bestmods_accuracy_stats_lim)
type_meas_result<-choice(bestmods_accuracy_stats_lim[1,2:last_col],
tied_alphas=tied_alphas,
tied_lambdas=tied_lambdas,
tied_cutoffs=tied_cutoffs)
print(type_meas_result)
bestmods_accuracy_stats_lim$best_alpha<-apply(bestmods_accuracy_stats_lim[,2:last_col],
1,
function(x) choice(x,
tied_alphas=tied_alphas,
tied_lambdas=tied_lambdas,
tied_cutoffs=tied_cutoffs)["alpha"])
bestmods_accuracy_stats_lim$best_lambda<-apply(bestmods_accuracy_stats_lim[,2:last_col],
1,
function(x) choice(x,
tied_alphas=tied_alphas,
tied_lambdas=tied_lambdas,
tied_cutoffs=tied_cutoffs)["lambda"])
bestmods_accuracy_stats_lim$best_cutoff<-apply(bestmods_accuracy_stats_lim[,2:last_col],
1,
function(x) choice(x,
tied_alphas=tied_alphas,
tied_lambdas=tied_lambdas,
tied_cutoffs=tied_cutoffs)["cutoff"])
print(bestmods_accuracy_stats_lim)
print(getmode(bestmods_accuracy_stats_lim$best_alpha))
print(getmode(bestmods_accuracy_stats_lim$best_lambda))
print(getmode(bestmods_accuracy_stats_lim$best_cutoff))
#print("here6")
return(list("mode_results"=list("best_alpha"=getmode(bestmods_accuracy_stats_lim$best_alpha),
"best_lambda"=getmode(bestmods_accuracy_stats_lim$best_lambda),
"best_cutoff"=getmode(bestmods_accuracy_stats_lim$best_cutoff),
"best_alpha_value"=cton(getmode(bestmods_accuracy_stats_lim$best_alpha)),
"best_lambda_value"=cton(getmode(bestmods_accuracy_stats_lim$best_lambda)),
"best_cutoff_value"=cton(getmode(bestmods_accuracy_stats_lim$best_cutoff))),
"type_meas_results"=list("best_alpha"=type_meas_result["alpha"],
"best_lambda"=type_meas_result["lambda"],
"best_cutoff"=type_meas_result["cutoff"],
"best_alpha_value"=cton(type_meas_result["alpha"]),
"best_lambda_value"=cton(type_meas_result["lambda"]),
"best_cutoff_value"=cton(type_meas_result["cutoff"])),
"limited results"=bestmods_accuracy_stats_lim,
"all_results"=bestmods_accuracy_stats))
}
#########elastic net k fold cross validation with model selection#############
#' en_kfold_model function
#'
#' Runs repeated k fold cross validated ENR models to select the best alpha and classification cutoff, then fits and evaluates the selected model
#' @param ddata data frame containing the response variable and all predictors to be modeled
#' @param response_var name of the binary (0/1) response variable. must be present in the data frame
#' @param iter number of times the k fold cross validation is repeated. default is 10
#' @param k number of folds per iteration. default is 10
#' @param num_alpha number of evenly spaced alpha values between 0 and 1 to evaluate. default is 20
#' @param seed random seed. default is 123
#' @param fit_met performance metric used to select the best model. one of 'accuracy', 'auroc', 'logloss', 'f1', 'ppv', 'npv', 'sens', 'spec', or 'bal_acc'; any other value falls back to mean squared error. default is 'accuracy'
#' @param loo boolean indicating whether leave one out cross validation should be used. default is FALSE
#' @param up_dn_samp switch for class balancing before modeling: 'upsamp' upsamples the minority class, 'dwnsamp' downsamples the majority class, and 'none' leaves the data as is. default is 'none'
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param type_meas loss measure passed to cv.glmnet's type.measure argument. default is "deviance"
#' @param na_rm boolean indicating whether NA fit values should be removed when averaging results across folds. default is TRUE
#' @param lr_cutoff vector of candidate predicted probability cutoffs for classification. default is .5
#' @param accuracy_modeling boolean used internally to prevent recursion when this function is called while breaking ties. default is FALSE
#' @keywords en_kfold_model enr elastic net cross validation
#' @export
#' @examples
#' \dontrun{en_kfold_model(ddata = dat, response_var = "y")}
#'
en_kfold_model<-function(ddata,response_var,iter=10,k=10,num_alpha=20,seed=123,fit_met='accuracy',loo=FALSE,up_dn_samp='none',eq_wt=FALSE,type_meas="deviance",na_rm=TRUE,lr_cutoff=c(.5),accuracy_modeling=FALSE){
set.seed(seed)
if(up_dn_samp=='upsamp'){
ddata<-upSample(ddata,as.factor(ddata[,response_var]))
ddata<-ddata[,-ncol(ddata)]
}
if(up_dn_samp=='dwnsamp'){
ddata<-downSample(ddata,as.factor(ddata[,response_var]))
ddata<-ddata[,-ncol(ddata)]
}
print(paste("there are ",nrow(ddata)," cases in the training dataset"))
acc_data<-ddata
itern<-1
f <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
if(loo==TRUE){
k<-nrow(ddata)
iter<-1
}
pbar <- create_progress_bar('text')
pbar$init(iter*k*num_alpha*length(lr_cutoff))
list.of.fits <- list()
results <- data.frame()
loo_tracker<-vector()
print("Running k fold cross validation to find optimal alpha value")
for(i in 1:iter){
if(loo==FALSE){
ddata<-ddata[order(runif(nrow(ddata))),]
}
ddata$grp<-rep(1:k,length.out=nrow(ddata))
for(j in 1:k)
{
itername<-paste("iter_",i,"_fold_",j,sep = "")
if(loo==TRUE){
train<-ddata[-j,-ncol(ddata)]
test<-ddata[j,-ncol(ddata)]
} else {
train<-ddata[ddata$grp!=j,-ncol(ddata)]
test<-ddata[ddata$grp==j,-ncol(ddata)]
}
#print(mean(train[,1]))
#print(mean(test[,1]))
wt<-NULL
if(eq_wt==TRUE){
wt<-ifelse(train[,response_var]==0,
1-(sum(train[,response_var]==0)/nrow(train)),
1-(sum(train[,response_var]==1)/nrow(train)))
}
train_pred <- model.matrix(f,train)[,-1]
train_resp <- train[,response_var]
#print(head(train_pred))
#print(train_resp)
test_pred <- model.matrix(f,test)[,-1]
test_resp <- test[,response_var]
#print(head(test_pred))
#print(test_resp)
for(alphaa in 0:num_alpha) {
#print(runif(1))
fit.name<-paste("iter_",i,"_fold_",j,"_alpha_",(alphaa/num_alpha),sep = "")
## Now fit a model (i.e. optimize lambda) and store it in a list that
## uses the variable name we just created as the reference.
#print(head(wt))
list.of.fits[[fit.name]] <- cv.glmnet(train_pred,
train_resp,
weights = wt,
type.measure = type_meas,
alpha=alphaa/num_alpha,
family="binomial")
#print(list.of.fits[[fit.name]])
# predicted <- cton(predict(list.of.fits[[fit.name]],
# s=list.of.fits[[fit.name]]$lambda.1se,type="class", newx=test_pred))
predicted_prob <- cton(predict(list.of.fits[[fit.name]],
s=list.of.fits[[fit.name]]$lambda.1se,type="response", newx=test_pred))
test_resp<-cton(test_resp)
#print(paste("predddd",mean(predicted)))
for(l_cut in 1:length(lr_cutoff)){
lr_c<-lr_cutoff[l_cut]
#print(lr_c)
predicted <- cton(ifelse(predicted_prob>lr_c,1,0))
## Calculate the fit...CHANGED THIS TO LOG LOSS BASED ON INFORMATION FOUND HERE https://towardsdatascience.com/why-not-mse-as-a-loss-function-for-logistic-regression-589816b5e03c
if(fit_met=="accuracy"){
fit <- MLmetrics::Accuracy(y_pred = predicted,
y_true = test_resp)
} else if(fit_met=="auroc"){
fit <- MLmetrics::AUC(y_true = factor(test_resp,levels = c("1","0")),
y_pred = factor(predicted,levels = c("1","0")))
} else if(fit_met=="logloss"){
fit <- MLmetrics::LogLoss(y_pred = predicted_prob,
y_true = test_resp)
} else if(fit_met=="f1"){
fit <- MLmetrics::F1_Score(y_true = factor(test_resp,levels = c("1","0")),
y_pred = factor(predicted,levels = c("1","0")),positive = "1")
} else if(fit_met=="ppv"){
fit <- MLmetrics::Precision(y_true = factor(test_resp,levels = c("1","0")),
y_pred = factor(predicted,levels = c("1","0")),positive = "1")
} else if(fit_met=="npv"){
fit <- caret::negPredValue(factor(test_resp,levels = c("1","0")),
factor(predicted,levels = c("1","0")),positive = "1")
} else if(fit_met=="sens"){
fit <- MLmetrics::Sensitivity(y_true = factor(test_resp,levels = c("1","0")),
y_pred = factor(predicted,levels = c("1","0")),positive = "1")
} else if(fit_met=="spec"){
fit <- MLmetrics::Specificity(y_true = factor(test_resp,levels = c("1","0")),
y_pred = factor(predicted,levels = c("1","0")),positive = "1")
} else if(fit_met=="bal_acc"){
fit <- yardstick::bal_accuracy_vec(truth = factor(test_resp,levels = c("1","0")),
estimate = factor(predicted,levels = c("1","0")))
} else {
fit <- mean((test_resp - predicted)^2)
}
#print(lr_c)
## Store the results
temp <- data.frame(alpha=alphaa/num_alpha, fit=fit, fit.name=fit.name, class_cutoff = lr_c)
results <- rbind(results, temp)
#print(temp)
itern<-itern+1
pbar$step()
}
}
}
}
print(paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""))
agg_results<-aggregate(list(fit=results$fit),by=list(alpha=results$alpha, class_cutoff=results$class_cutoff),mean,na.rm=na_rm)
agg_results_select<-agg_results[!is.na(agg_results$fit),]
#print(names(agg_results_select))
#print("look here")
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
#print(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"class_cutoff"])
print(paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""))
#besten_mod_name<-as.character(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"fit.name"])
if(fit_met=="accuracy" | fit_met=="auroc" | fit_met=="f1" | fit_met=="ppv" | fit_met=="npv" | fit_met=="sens" | fit_met=="spec" | fit_met=="bal_acc"){
best_mod_alpha<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"alpha"]
best_mod_cutoff<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),]$class_cutoff
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),])
} else {
best_mod_alpha<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"alpha"]
best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),]$class_cutoff
#print(best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),])
}
#print(agg_results)
print("best alphas")
print(best_mod_alpha)
print("best cutoffs")
#print(names(best_mod_cutoff))
print(best_mod_cutoff)
x <- model.matrix(f,ddata)[,-1]
y <- ddata[,response_var]
if((length(best_mod_alpha)>1)&!accuracy_modeling){
print("This will take a little while longer because we need to examine performance metrics as there was a tie for best alpha")
best_mod_alpha_res<-en_kfold_accuracy_tied(tied_alphas=best_mod_alpha,
tied_cutoffs=best_mod_cutoff,
ddata = acc_data,
response_var = response_var,
eq_wt = eq_wt,
type_meas = type_meas)
# keep the cutoff paired with the winning alpha (first match if that alpha appears more than once)
best_mod_cutoff<-best_mod_cutoff[match(best_mod_alpha_res$best_alpha_value,best_mod_alpha)]
best_mod_alpha<-best_mod_alpha_res$best_alpha_value
} else {
best_mod_alpha_res<-"there was no need to use performance measures"
}
best_model<-cv.glmnet(x,
y,
type.measure=type_meas,
alpha=best_mod_alpha,
family="binomial")
if(eq_wt==TRUE){
final_weights<-ifelse(y==0,
1-(sum(y==0)/length(y)),
1-(sum(y==1)/length(y)))
#print(final_weights)
best_model<-cv.glmnet(x,
y,
type.measure=type_meas,
alpha=best_mod_alpha,
weights = final_weights,
family = "binomial")
}
print("Calculating model performance")
if(loo==TRUE){
en_accuracy<-en_kfold_accuracy(ddata = acc_data,
response_var = response_var,
mod_alpha = best_mod_alpha,
loo = TRUE,
type_meas = type_meas,
lr_cutoff = best_mod_cutoff)
} else {
en_accuracy<-en_kfold_accuracy(ddata = acc_data,
response_var = response_var,
mod_alpha = best_mod_alpha,
iter = iter,
k = k,
type_meas = type_meas,
lr_cutoff = best_mod_cutoff)
}
# print(model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))])
# print(predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="class"))
# print(ddata[,response_var])
# drop the fold assignment column added during CV so it is not treated as a predictor
if("grp" %in% colnames(ddata)) {
ddata <- ddata[, !(colnames(ddata) %in% c("grp"))]
}
best_model_internal_pred_probs<-predict(best_model,
newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-1],
type="response")
best_model_internal_conf_mat<-caret::confusionMatrix(factor(ifelse(best_model_internal_pred_probs>best_mod_cutoff,1,0),
levels = c("1","0")),
factor(ddata[,response_var],
levels = c("1","0")))$table
en_accuracy_results<-en_accuracy$kfold_results
en_accuracy_stats<-rown_to_var(data.frame("Value"=round(en_accuracy$kfold_agg_results,4)),varname = "statistic")
best_mod_ors_all<-cbind(rown_to_var(as.data.frame(as.matrix(round(exp(coef(best_model,best_model$lambda.1se)),3)))),
"variable_dropped"=as.vector(coef(best_model,best_model$lambda.1se))==0)
best_mod_betas<-rown_to_var(as.data.frame(as.matrix(round(coef(best_model,best_model$lambda.1se),3))))
print(paste("best alpha =",best_mod_alpha))
print(paste("best lambda =",best_model$lambda.1se))
print(paste("best cutoff =",best_mod_cutoff))
print(en_accuracy_stats)
print(best_model_internal_conf_mat)
return(list("best_en_model"=best_model,
"best_mod_ors_all"=best_mod_ors_all,
"best_mod_betas"=best_mod_betas,
"alpha_agg_results"=agg_results,
"best_mod_alpha"=best_mod_alpha,
"best_mod_lambda"=best_model$lambda.1se,
"best_mod_cutoff"=best_mod_cutoff,
"best_model_internal_conf_mat"=best_model_internal_conf_mat,
"best_model_internal_pred_probs"=best_model_internal_pred_probs,
"best_mod_alpha_res"=best_mod_alpha_res,
"en_accuracy_stats"=en_accuracy_stats,
"en_accuracy_results"=en_accuracy_results,
#"results"=results,
"nas_removed_from_results"=paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""),
"nas_removed_from_agg_results"=paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""),
"list.of.fits"=list.of.fits))
}
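## Example usage (a sketch; `dat` with a binary "y" column is an assumption):
## mod <- en_kfold_model(ddata = dat, response_var = "y", iter = 5, k = 5,
##                       num_alpha = 10, lr_cutoff = c(.4, .5, .6))
## mod$best_mod_alpha; mod$best_mod_lambda; mod$best_mod_cutoff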
#function to run enr mods using different metrics
#' run_all_enr_fit_mets function
#'
#' Uses the cross validation ENR functions to consecutively fit and save one model per performance metric so the metrics can be compared
#' @param dat data frame containing the response variable and all predictors to be modeled
#' @param response_var name of the binary (0/1) response variable
#' @param tune_type which tuning routine to use: 'og' (en_kfold_model), 'grid' (en_kfold_model_grid), or 'grid_lim' (en_kfold_model_grid_lim). default is 'og'
#' @param modname stem used when naming the saved model files. default is 'model'
#' @param specs boolean indicating whether the tuning settings (iterations, folds, alphas, cutoffs, weighting) should be embedded in the file names. default is TRUE
#' @param date_tf boolean indicating whether the run date should be embedded in the file names. default is TRUE
#' @param time_tf boolean indicating whether the run time should be embedded in the file names. default is FALSE
#' @param ties_measure how ties are broken by the grid routines. default is 'mode'
#' @param fit_mets vector of performance metrics to fit models for; the names must match the fit_met values recognized by the tuning functions. default is c("accuracy","bal_acc","ppv","f1","sens","auroc","npv","spec","logloss")
#' @param dir_name directory the model .rds files are written to
#' @param iter number of times the k fold cross validation is repeated. default is 50
#' @param k number of folds per iteration. default is 10
#' @param num_alpha number of evenly spaced alpha values between 0 and 1 to evaluate. default is 20
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param lr_cutoff vector of candidate predicted probability cutoffs. default is seq(.05,.95,by=.05)
#' @param ... additional arguments passed to the selected tuning function
#' @keywords run_all_enr_fit_mets enr metrics comparison
#' @export
#' @examples
#' \dontrun{run_all_enr_fit_mets(dat = dat, response_var = "y", modname = "demo")}
#'
run_all_enr_fit_mets<-function(dat,response_var,tune_type="og",modname="model",specs=TRUE,date_tf=TRUE,time_tf=FALSE,ties_measure="mode",fit_mets=c("accuracy","bal_acc","ppv","f1","sens","auroc","npv","spec","logloss"),dir_name='I:/Lagisetty SDR Misuse/5. Identifiable Data/E. Database/treatment arm creation/treatment arm creation/enr mods/',iter=50,k=10,num_alpha=20,eq_wt = FALSE, lr_cutoff = seq(from = .05, to = .95, by = .05), ...){
list_of_files<-vector()
metric_comparisons<-data.frame()
metric_fitstats<-data.frame()
metric_ors<-data.frame()
metric_confmats<-list()
if(date_tf&time_tf){
date_time_var<-format(now(),"%m_%d_%Y_%H.%M.%S")
} else if (date_tf) {
date_time_var<-format(now(),"%m_%d_%Y")
} else if (time_tf){
date_time_var<-format(now(),"%H.%M.%S")
} else {
date_time_var<-""
}
if(specs){
specs_var<-paste("i",iter,"_k",k,"_a",num_alpha,"_c",length(lr_cutoff),"_ew",ifelse(eq_wt,"T_","F_"),sep = "")
} else {
specs_var<-""
}
pbar <- create_progress_bar('text')
pbar$init(length(fit_mets))
for(i in 1:length(fit_mets)){
print(fit_mets[i])
mod_name<-paste(modname,"_",fit_mets[i],"_",tune_type,"tune_",specs_var,date_time_var,sep = "")
if(tune_type=="og"){
mod<-en_kfold_model(ddata = dat,
response_var = response_var,
iter=iter,
k=k,
num_alpha=num_alpha,
eq_wt = eq_wt,
fit_met=fit_mets[i],
lr_cutoff = lr_cutoff,
...)
} else if(tune_type=="grid"){
mod<-en_kfold_model_grid(ddata = dat,
response_var = response_var,
iter=iter,
k=k,
num_alpha=num_alpha,
eq_wt = eq_wt,
fit_met=fit_mets[i],
lr_cutoff = lr_cutoff,
ties_measure="mode",
...)
} else if(tune_type=="grid_lim"){
mod<-en_kfold_model_grid_lim(ddata = dat,
response_var = response_var,
iter=iter,
k=k,
num_alpha=num_alpha,
eq_wt = eq_wt,
fit_met=fit_mets[i],
lr_cutoff = lr_cutoff,
ties_measure="mode",
...)
} else {
print("enter valid tune_type")
}
#filename<-paste(dir_name,mod_name,".RData",sep = "")
filename<-paste(dir_name,mod_name,".rds",sep = "")
list_of_files[i]<-filename
#save(mod,file=filename)
#do.call(save,list(mod,file=filename))
do.call(saveRDS,list(mod,file=filename))
if(i==1){
metric_comparisons<-data.frame(mod$en_accuracy_stats)
metric_ors<-data.frame(mod$best_mod_ors_all)
metric_fitstats[i,c("model","alpha","lambda")]<-c(mod_name,mod$best_mod_alpha,mod$best_mod_lambda)
metric_confmats[[fit_mets[i]]]<-mod$best_model_internal_conf_mat
metric_comparisons<-highlandr::rename.variables(metric_comparisons,fit_mets[i],"Value")
metric_ors<-highlandr::rename.variables(metric_ors,apply(expand.grid(fit_mets[i], c("or","drop")), 1, paste, collapse="_"),c("s1","variable_dropped"))
} else {
# metric_comparisons<-as.data.frame(cbind(metric_comparisons,mod$en_accuracy_stats$Value))
metric_comparisons[,fit_mets[i]]<-mod$en_accuracy_stats$Value
metric_fitstats[i,c("model","alpha","lambda")]<-c(mod_name,mod$best_mod_alpha,mod$best_mod_lambda)
metric_ors[,apply(expand.grid(fit_mets[i], c("or","drop")), 1, paste, collapse="_")]<-mod$best_mod_ors_all[,c("s1","variable_dropped")]
metric_confmats[[fit_mets[i]]]<-mod$best_model_internal_conf_mat
}
rm(mod)
print(metric_comparisons)
print("total complete")
pbar$step()
}
#colnames(metric_comparisons[,2:length(fit_mets)])<-fit_mets
ret<-list("list of files"=list_of_files,"metric_comparisons"=metric_comparisons,"metric_fitstats"=metric_fitstats,"metric_ors"=metric_ors,"metric_confmats"=metric_confmats)
#save(ret,file=paste(dir_name,"file_list_",modname,"_",date_time_var,".RData",sep = ""))
#do.call(save, list(ret,file=paste(dir_name,"file_list_",modname,"_",date_time_var,".RData",sep = "")))
do.call(saveRDS, list(ret,file=paste(dir_name,"file_list_",modname,"_",date_time_var,".rds",sep = "")))
return(ret)
}
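## Example usage (a sketch; `dat` and the output directory are assumptions):
## res <- run_all_enr_fit_mets(dat = dat, response_var = "y", modname = "demo",
##                             fit_mets = c("accuracy", "f1"), dir_name = "mods/", iter = 5)
## res$metric_comparisons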
#' retrieve_enr_mods function
#'
#' Retrieves saved ENR models from a directory using the output of the run_all_enr_fit_mets function
#' @param directory directory to read model files from when all_files=TRUE
#' @param list_of_files optional character vector of full file paths to read (e.g. the 'list of files' element returned by run_all_enr_fit_mets). default is NULL
#' @param all_files boolean indicating whether every matching file in directory should be read when list_of_files is NULL. default is FALSE
#' @param patterns optional vector of regular expressions used to filter the files found in directory. default is "" (no filtering)
#' @keywords retrieve_enr_mods enr retrieve models
#' @export
#' @examples
#' \dontrun{retrieve_enr_mods(directory = "mods/", all_files = TRUE)}
#'
retrieve_enr_mods<-function(directory,list_of_files=NULL,all_files=FALSE,patterns=""){
if(!is.null(list_of_files)){
enr_mods<-list()
list_of_names<-sapply(lapply(sapply(list_of_files,function(x) strsplit(x,"/")),tail,n=1),function(x) strsplit(x,"\\.")[[1]][1])
#print(sapply(list_of_files,function(x) strsplit(x,"\\.")[[1]][1]))
print(list_of_names)
for(i in 1:length(list_of_files)){
# models are written with saveRDS above, so read them back with readRDS
enr_mods[[list_of_names[i]]]<-readRDS(list_of_files[i])
}
} else if(all_files){
files <- list.files(directory)
if(patterns!=""){
for(j in 1:length(patterns)){
files <- files %>%
tibble() %>%
filter(stringr::str_detect(files, patterns[j])) %>%
pull()
}
}
enr_mods <- list() #create empty list
#loop through the files
for (k in files) {
enr_mods[[k]] <- readRDS(paste0(directory, k)) #add each file to its list position
}
} else {
print("supply file names or switch all_files to TRUE")
}
return(enr_mods)
}
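## Example usage (a sketch): reload the models saved by run_all_enr_fit_mets
## mods <- retrieve_enr_mods(directory = "mods/", list_of_files = res$`list of files`)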
#' en_kfold_accuracy_grid function
#'
#' Calculates the cross validated accuracy of a grid based ENR model fit at a fixed alpha and lambda
#' @param ddata data frame containing the response variable and all predictors
#' @param response_var name of the binary (0/1) response variable
#' @param mod_alpha the elastic net mixing parameter (alpha) to use when fitting the model
#' @param mod_lambda the lambda value to use when fitting the model
#' @param iter number of times the k fold cross validation is repeated. default is 100
#' @param k number of folds per iteration. default is 10
#' @param seed random seed. default is 123
#' @param loo boolean indicating whether leave one out cross validation should be used. default is FALSE
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param type_meas retained for interface consistency with the cv.glmnet based functions. default is "deviance"
#' @param lr_cutoff predicted probability cutoff used to classify cases as 1 vs 0. default is .5
#' @keywords en_kfold_accuracy_grid enr grid accuracy
#' @export
#' @examples
#' \dontrun{en_kfold_accuracy_grid(ddata = dat, response_var = "y", mod_alpha = .5, mod_lambda = .1)}
#'
en_kfold_accuracy_grid<-function(ddata,response_var,mod_alpha,mod_lambda,iter=100,k=10,seed=123,loo=FALSE,eq_wt=FALSE,type_meas = "deviance",lr_cutoff=c(.5)){
print("cutoff")
print(lr_cutoff)
kfoldcv_results_data<-data.frame()
set.seed(seed)
itern<-1
ac_model_form <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
loo_tracker<-vector()
if(loo==TRUE){
k<-nrow(ddata)
iter<-1
}
pbar <- create_progress_bar('text')
pbar$init(iter*k)
for(i in 1:iter){
if(loo==FALSE){
ddata<-ddata[order(runif(nrow(ddata))),]
}
ddata$grp<-rep(1:k,length.out=nrow(ddata))
for(j in 1:k)
{
itername<-paste("cv_",i,"_reffold_",j,sep = "")
if(loo==TRUE){
train<-ddata[-j,-ncol(ddata)]
test<-ddata[j,-ncol(ddata)]
} else {
train<-ddata[ddata$grp!=j,-ncol(ddata)]
test<-ddata[ddata$grp==j,-ncol(ddata)]
}
wt<-NULL
if(eq_wt==TRUE){
wt<-ifelse(train[,response_var]==0,
1-(sum(train[,response_var]==0)/nrow(train)),
1-(sum(train[,response_var]==1)/nrow(train)))
}
train_pred <- model.matrix(ac_model_form,train)[,-1]
train_resp <- train[,response_var]
test_pred <- model.matrix(ac_model_form,test)[,-1]
if(loo==TRUE){
test_pred<-t(test_pred)
}
test_resp <- test[,response_var]
# Fitting at the supplied alpha and lambda (type.measure applies only to cv.glmnet, so it is not passed here)
model <- glmnet(x = train_pred,
y = train_resp,
weights = wt,
alpha=mod_alpha,
lambda=mod_lambda,
family="binomial")
# Predict results at the supplied lambda
results_pred_prob <- predict(model,newx=test_pred,type="response",s=mod_lambda)
results_pred <- ifelse(cton(results_pred_prob)>lr_cutoff,1,0)
results_pred<-factor(results_pred,levels = c("1","0"))
test_resp<-factor(test_resp,levels = c("1","0"))
#print(cbind(results_pred,test_resp,(results_pred_prob>lr_cutoff),lr_cutoff))
# Confusion matrix
cm <- caret::confusionMatrix(data = results_pred,
reference = test_resp,
positive = "1")
#AUC
mod_auc<-suppressMessages(roc(test_resp,cton(results_pred_prob))$auc)
# Collecting results
cm_results<-t(data.frame(c("iter_fold"=itername,cm$overall,cm$byClass,"AUC"=mod_auc)))
#rbind the results together
kfoldcv_results_data<-rbind(kfoldcv_results_data,cm_results)
itern<-itern+1
pbar$step()
}
}
# Average accuracy of the model
rownames(kfoldcv_results_data)<-1:nrow(kfoldcv_results_data)
kfoldcv_results<-colMeans(apply(kfoldcv_results_data[,2:ncol(kfoldcv_results_data)],2,cton),na.rm = TRUE)
kfold_results<-list("kfold_agg_results"=kfoldcv_results,"kfold_results"=kfoldcv_results_data)
return(kfold_results)
}
#' cut_performance function
#'
#' Calculates the chosen performance metric at each candidate classification cutoff
#' @param predicted_probs vector of predicted probabilities from a fitted model
#' @param cutpoints vector of candidate probability cutoffs to evaluate
#' @param fit_met performance metric to calculate. one of 'accuracy', 'auroc', 'logloss', 'f1', 'ppv', 'npv', 'sens', 'spec', or 'bal_acc'; any other value falls back to mean squared error
#' @param y_true vector of observed (0/1) responses
#' @keywords cut_performance enr cutoff performance
#' @export
#' @examples
#' \dontrun{cut_performance(predicted_probs = p, cutpoints = seq(.1,.9,by=.1), fit_met = "accuracy", y_true = y)}
#'
cut_performance<-function(predicted_probs,cutpoints,fit_met,y_true){
preds<-rename.variables(data.frame(sapply(cutpoints,function(x) factor(cton(ifelse(predicted_probs>x,1,0)),levels = c("1","0")))),paste("c_",cutpoints,sep=""))
#res<-lapply(preds,MLmetrics::Accuracy,y_true=factor(y_true,levels = c("1","0")))
#print(preds)
if(fit_met=="accuracy"){
fit <- lapply(preds,MLmetrics::Accuracy,y_true=factor(y_true,levels = c("1","0")))
} else if(fit_met=="auroc"){
fit <- lapply(preds,MLmetrics::AUC,y_true=factor(y_true,levels = c("1","0")))
} else if(fit_met=="logloss"){
fit <- lapply(preds,MLmetrics::LogLoss,y_true=factor(y_true,levels = c("1","0")))
} else if(fit_met=="f1"){
fit <- lapply(preds,MLmetrics::F1_Score,y_true=factor(y_true,levels = c("1","0")),positive = "1")
} else if(fit_met=="ppv"){
fit <- lapply(preds,MLmetrics::Precision,y_true=factor(y_true,levels = c("1","0")),positive = "1")
} else if(fit_met=="npv"){
fit <- lapply(preds,caret::negPredValue,reference=factor(y_true,levels = c("1","0")),positive = "1")
} else if(fit_met=="sens"){
fit <- lapply(preds,MLmetrics::Sensitivity,y_true=factor(y_true,levels = c("1","0")),positive = "1")
} else if(fit_met=="spec"){
fit <- lapply(preds,MLmetrics::Specificity,y_true=factor(y_true,levels = c("1","0")),positive = "1")
} else if(fit_met=="bal_acc"){
fit <- lapply(lapply(preds,factor,levels = c("1","0")),yardstick::bal_accuracy_vec,truth=factor(y_true,levels = c("1","0")))
} else {
# fall back to mean squared error; preds are factors, so convert both sides to numeric first
fit <- lapply(preds,function(x) mean((cton(y_true) - cton(x))^2))
}
return(unlist(fit))
}
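## Example usage (a sketch with simulated data):
## set.seed(1); p <- runif(50); y <- rbinom(50, 1, p)
## cut_performance(p, cutpoints = c(.3, .5, .7), fit_met = "accuracy", y_true = y)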
#' en_kfold_model_grid_lim function
#'
#' Runs k fold cross validated ENR models with simultaneous selection of alpha, lambda, and the classification cutoff via a grid search, with optional checkpointing so long runs can be restarted
#' @param ddata data frame containing the response variable and all predictors to be modeled
#' @param response_var name of the binary (0/1) response variable
#' @param iter number of times the k fold cross validation is repeated. default is 10
#' @param k number of folds per iteration. default is 10
#' @param num_alpha number of evenly spaced alpha values between 0 and 1 to evaluate. default is 20
#' @param num_lambda number of log spaced lambda values between 10^-3 and 10^3 to evaluate. default is 100
#' @param seed random seed. default is 123
#' @param fit_met performance metric used to select the best model (see cut_performance). default is 'accuracy'
#' @param loo boolean indicating whether leave one out cross validation should be used. default is FALSE
#' @param up_dn_samp switch for class balancing before modeling: 'upsamp', 'dwnsamp', or 'none'. default is 'none'
#' @param eq_wt boolean indicating whether cases should be weighted by the inverse of their class prevalence. default is FALSE
#' @param type_meas loss measure passed through to the tie breaking functions. default is "deviance"
#' @param na_rm boolean indicating whether NA fit values should be removed when averaging results across folds. default is TRUE
#' @param lr_cutoff vector of candidate predicted probability cutoffs. default is .5
#' @param accuracy_modeling boolean used internally to prevent recursion when breaking ties. default is FALSE
#' @param ties_measure how ties are broken: 'mode' takes the modal hyperparameter set across performance metrics, 'type_measure' takes the set chosen by the first metric row. default is 'mode'
#' @param save_results currently unused. default is FALSE
#' @param writeout boolean indicating whether intermediate aggregated results should be checkpointed to writeout_path. default is TRUE
#' @param writeout_num number of grid rows between checkpoints. default is num_alpha*5
#' @param writeout_path directory the checkpoint .rds files are written to
#' @param restarting boolean indicating whether a previous checkpoint in writeout_path should be loaded so only the remaining grid rows are run. default is FALSE
#' @keywords en_kfold_model_grid_lim enr grid cross validation
#' @export
#' @examples
#' \dontrun{en_kfold_model_grid_lim(ddata = dat, response_var = "y", fit_met = "accuracy")}
#'
en_kfold_model_grid_lim<-function(ddata,response_var,iter=10,k=10,num_alpha=20,num_lambda=100,seed=123,fit_met='accuracy',loo=FALSE,up_dn_samp='none',eq_wt=FALSE,type_meas="deviance",na_rm=TRUE,lr_cutoff=c(.5),accuracy_modeling=FALSE,ties_measure="mode",save_results = FALSE,writeout = T,writeout_num=num_alpha*5,writeout_path = "I:/Lagisetty SDR Misuse/5. Identifiable Data/E. Database/treatment arm creation/treatment arm creation/enr grid writeout directory/",restarting=FALSE){
start.time <- Sys.time()
set.seed(seed)
alpha_start <- 1
timercnt <- 1
# if(!is.factor(d1_enr[,"include_criteria"])){
# d1_enr[,"include_criteria"]<-factor(d1_enr[,"include_criteria"],levels=c(0,1),labels = c("0","1"))
# }
enr_grid<-expand.grid(alpha = seq(0,1,by=(1/(num_alpha))),
lambda = (10^seq(-3, 3, length = num_lambda))) %>% #,
#cutoff = lr_cutoff) %>%
arrange(alpha,lambda)#,cutoff)
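# the grid crosses num_alpha+1 evenly spaced alphas on [0,1] with num_lambda log spaced
# lambdas on [1e-3, 1e3]; cutoffs are scored inside cut_performance rather than expanded
# into the grid, which keeps the number of glmnet fits down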
#print(nrow(enr_grid))
if(restarting){
# List all the .rds files in the directory
rds_files <- list.files(path = writeout_path, pattern = paste0(fit_met,".*\\.rds$"), full.names = TRUE)
# Check if there are any .rds files in the directory
if (length(rds_files) > 0) {
if (length(rds_files) > 1) {
print("There is more than 1 file in directory, using the first one but double check")
}
first_rds_file <- rds_files[1]
restart_df <- readRDS(first_rds_file)
# print(enr_grid)
# print(restart_df)
#
# print(nrow(enr_grid))
# print(nrow(restart_df))
enr_grid<-anti_join(enr_grid,restart_df,by=c("alpha" = "alpha","lambda" = "lambda"))#,"cutoff" = "class_cutoff"))
timercnt <- nrow(enr_grid)
# print(enr_grid)
# print(nrow(enr_grid))
#alpha_start <- nrow(restart_df) + 1
} else {
cat("No .rds files found in the specified directory. Not limiting the data frame at all\n")
}
}
if(up_dn_samp=='upsamp'){
ddata<-upSample(ddata,as.factor(ddata[,response_var]))
ddata<-ddata[,-ncol(ddata)]
}
if(up_dn_samp=='dwnsamp'){
ddata<-downSample(ddata,as.factor(ddata[,response_var]))
ddata<-ddata[,-ncol(ddata)]
}
#print(restarting)
#print()
if(restarting&nrow(enr_grid)==0){
agg_results<-restart_df
f <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
acc_data<-ddata
} else {
print(paste("there are",nrow(ddata),"cases in the training dataset"))
print(paste("with",num_alpha,"alphas,",num_lambda,"lambdas, and",length(lr_cutoff),"cutoffs there are",format(nrow(enr_grid)*length(lr_cutoff),scientific=FALSE),"hyperparameter combinations which is",format(iter*k*nrow(enr_grid),scientific=FALSE),"models to run"))
acc_data<-ddata
itern<-1
f <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
if(loo==TRUE){
k<-nrow(ddata)
iter<-1
}
# pbar <- create_progress_bar('text')
# pbar$init(iter*k*nrow(enr_grid))
pbar<-txtProgressBar(min=0,max = (iter*k*nrow(enr_grid)),style = 3)
list.of.fits <- list()
results <- data.frame()
loo_tracker<-vector()
if(restarting){
agg_results<-restart_df
} else {
agg_results<-data.frame()
}
na_results<-0
previous_file_name<-NULL
print("Running k fold cross validation to find optimal alpha value")
for(alphaa in alpha_start:nrow(enr_grid)) {
#print(alpha_start)
#print(alphaa)
results<-data.frame()
for(i in 1:iter){
if(loo==FALSE){
ddata<-ddata[order(runif(nrow(ddata))),]
}
ddata$grp<-rep(1:k,length.out=nrow(ddata))
for(j in 1:k){
itername<-paste("iter_",i,"_fold_",j,sep = "")
if(loo==TRUE){
train<-ddata[-j,-ncol(ddata)]
test<-ddata[j,-ncol(ddata)]
} else {
train<-ddata[ddata$grp!=j,-ncol(ddata)]
test<-ddata[ddata$grp==j,-ncol(ddata)]
}
wt<-NULL
if(eq_wt==TRUE){
wt<-ifelse(train[,response_var]==0,
1-(sum(train[,response_var]==0)/nrow(train)),
1-(sum(train[,response_var]==1)/nrow(train)))
}
train_pred <- model.matrix(f,train)[,-1]
train_resp <- train[,response_var]
test_pred <- model.matrix(f,test)[,-1]
test_resp <- test[,response_var]
mod_alpha<-enr_grid[alphaa,"alpha"]
mod_lambda<-enr_grid[alphaa,"lambda"]
fit.name<-paste("iter_",i,"_fold_",j,"_alpha_",mod_alpha,"_lambda_",mod_lambda,sep = "")
#mod_cutoff<-enr_grid[alphaa,"cutoff"]
#print(enr_grid)
## Now fit a model (i.e. optimize lambda) and store it in a list that
## uses the variable name we just created as the reference.
model_fit <- glmnet(train_pred,
train_resp,
weights = wt,
alpha=mod_alpha,
lambda=mod_lambda,
family="binomial")
#print("made it 4")
predicted_prob <- cton(predict(model_fit,
s=mod_lambda,type="response", newx=test_pred))
#print("made it 5")
cut_perform<-cut_performance(predicted_prob,lr_cutoff,fit_met,test_resp)
#print("made it 6")
## Store the results
temp <- data.frame(alpha=rep(mod_alpha,length(lr_cutoff)),
lambda=rep(mod_lambda,length(lr_cutoff)),
class_cutoff = lr_cutoff,
fit=cut_perform,
fit.name=rep(itername,length(lr_cutoff)))
#print("made it 7")
results <- rbind(results, temp)
itern<-itern+1
setTxtProgressBar(pbar,((alphaa-1)*iter*k)+((i-1)*k)+j)
update_timer(start.time,Sys.time(),timercnt,(iter*k*nrow(enr_grid)))
timercnt <- timercnt+1
#pbar$step()
}
}
na_results<-na_results+sum(is.na(results$fit))
agg_results_proto<-aggregate(list(fit=results$fit),by=list(alpha=results$alpha, lambda=results$lambda, class_cutoff=results$class_cutoff),mean,na.rm=na_rm)
agg_results<-rbind(agg_results,agg_results_proto)
if((alphaa%%writeout_num)==0&writeout){
if(restarting){
model_combos<-alphaa+(nrow(restart_df)/length(lr_cutoff))
} else {
model_combos<-alphaa
}
writ_path<-paste(writeout_path,"model_results_first_",format(now(),"%m_%d_%Y"),"_",fit_met,"_",model_combos,".rds",sep = "")
do.call(saveRDS,list(agg_results,file=writ_path))
end.time <- Sys.time()
dif_time<-(end.time - start.time)
if (!is.null(previous_file_name)) {
if (file.exists(previous_file_name)) {
file.remove(previous_file_name)
}
}
previous_file_name<-writ_path
}
}
print(paste("n(%) of NA fit values ",na_results," (",highlandr::percent((na_results/(iter*k*nrow(enr_grid))),digits=1),")",sep = ""))
}
agg_results_select<-agg_results[!is.na(agg_results$fit),]
#print(names(agg_results_select))
#print("look here")
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
#print(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"class_cutoff"])
print(paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""))
#besten_mod_name<-as.character(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"fit.name"])
if(fit_met=="accuracy" | fit_met=="auroc" | fit_met=="f1" | fit_met=="ppv" | fit_met=="npv" | fit_met=="sens" | fit_met=="spec" | fit_met=="bal_acc"){
best_mod_alpha<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"alpha"]
best_mod_lambda<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"lambda"]
best_mod_cutoff<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),]$class_cutoff
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
#print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),])
} else {
best_mod_alpha<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"alpha"]
best_mod_lambda<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"lambda"]
best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),]$class_cutoff
#print(best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),])
}
#print(agg_results)
print("best alphas")
print(best_mod_alpha)
print("best lambdas")
print(best_mod_lambda)
print("best cutoffs")
print(best_mod_cutoff)
x <- model.matrix(f,ddata)[,-1]
y <- ddata[,response_var]
if((length(best_mod_alpha)>1)&!accuracy_modeling){
print("This will take a little while longer becase we need to examine performance metrics as there was a tie for best alpha")
best_mod_tied_res<-en_kfold_accuracy_tied_grid(tied_alphas=best_mod_alpha,
tied_lambdas=best_mod_lambda,
tied_cutoffs=best_mod_cutoff,
ddata = acc_data,
response_var = response_var,
eq_wt = eq_wt,
type_meas = type_meas)
if(ties_measure=="type_measure"){
best_mod_alpha<-best_mod_tied_res$type_meas_results$best_alpha_value[1]
best_mod_lambda<-best_mod_tied_res$type_meas_results$best_lambda_value[1]
best_mod_cutoff<-best_mod_tied_res$type_meas_results$best_cutoff_value[1]
} else{
#print(best_mod_tied_res)
best_mod_alpha<-best_mod_tied_res$mode_results$best_alpha_value
best_mod_lambda<-best_mod_tied_res$mode_results$best_lambda_value
best_mod_cutoff<-best_mod_tied_res$mode_results$best_cutoff_value
}
} else {
best_mod_tied_res<-"there was no need to use performance measures"
}
best_model<-glmnet(x,
y,
alpha=best_mod_alpha,
lambda=best_mod_lambda,
family="binomial")
if(eq_wt==TRUE){
final_weights<-ifelse(y==0,
1-(sum(y==0)/length(y)),
1-(sum(y==1)/length(y)))
#print(final_weights)
best_model<-glmnet(x,
y,
alpha=best_mod_alpha,
lambda=best_mod_lambda,
weights = final_weights,
family = "binomial")
}
print("Calculating model performance")
if(loo==TRUE){
en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
response_var = response_var,
mod_alpha = best_mod_alpha,
mod_lambda = best_mod_lambda,
loo = TRUE,
type_meas = type_meas,
lr_cutoff = best_mod_cutoff)
} else {
en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
response_var = response_var,
mod_alpha = best_mod_alpha,
mod_lambda = best_mod_lambda,
iter = iter,
k = k,
type_meas = type_meas,
lr_cutoff = best_mod_cutoff)
}
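# en_kfold_accuracy_grid re-runs cross validation at the selected
# alpha/lambda/cutoff so the reported performance comes from held-out folds
# rather than from predictions on the same data the final model was fit to.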
# print(model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))])
# print(predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="class"))
# print(ddata[,response_var])
if("grp" %in% colnames(ddata)) {
ddata <- ddata[, !(colnames(ddata) %in% c("grp"))]
}
#print(model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-1])
best_model_internal_pred_probs<-predict(best_model,
newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-1],
type="response")
#print("here2")
best_model_internal_conf_mat<-caret::confusionMatrix(factor(ifelse(predict(best_model,
newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-1],
type="response")>best_mod_cutoff,1,0),
levels = c("1","0")),
factor(ddata[,response_var],
levels = c("1","0")))$table
en_accuracy_results<-en_accuracy$kfold_results
en_accuracy_stats<-rown_to_var(data.frame("Value"=round(en_accuracy$kfold_agg_results,4)),varname = "statistic")
best_mod_ors_all<-cbind(rown_to_var(as.data.frame(as.matrix(round(exp(coef(best_model)),3)))),
"variable_dropped"=as.vector(coef(best_model))==0)
best_mod_betas<-rown_to_var(as.data.frame(as.matrix(round(coef(best_model),3))))
print(paste("best alpha =",best_mod_alpha))
print(paste("best lambda =",best_mod_lambda))
print(paste("best cutoff =",best_mod_cutoff))
print(en_accuracy_stats)
print(best_model_internal_conf_mat)
end.time <- Sys.time()
time.taken<-format_time_difference(end.time - start.time)
print(time.taken)
return(list("best_en_model"=best_model,
"best_mod_ors_all"=best_mod_ors_all,
"best_mod_betas"=best_mod_betas,
"alpha_agg_results"=agg_results,
"best_mod_alpha"=best_mod_alpha,
"best_mod_lambda"=best_mod_lambda,
"best_mod_cutoff"=best_mod_cutoff,
"best_model_internal_conf_mat"=best_model_internal_conf_mat,
"best_model_internal_pred_probs"=best_model_internal_pred_probs,
"best_mod_tied_res"=best_mod_tied_res,
"en_accuracy_stats"=en_accuracy_stats,
"en_accuracy_results"=en_accuracy_results,
"runtime"=time.taken,
#"results"=results,
#"nas_removed_from_results"=paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""),
"nas_removed_from_agg_results"=paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = "")#,
#"list.of.fits"=list.of.fits
))
}
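# A minimal usage sketch, kept commented so the file still sources cleanly.
# It assumes the function defined above is en_kfold_model_grid (per the
# legacy signature retained below) and uses a hypothetical two-predictor
# data frame with a 0/1 outcome:
#
# set.seed(42)
# demo_df <- data.frame(outcome = rbinom(200, 1, 0.35),
#                       x1 = rnorm(200),
#                       x2 = rnorm(200))
# demo_res <- en_kfold_model_grid(ddata = demo_df,
#                                 response_var = "outcome",
#                                 iter = 5,
#                                 k = 5,
#                                 fit_met = "auroc",
#                                 lr_cutoff = c(.4, .5, .6))
# demo_res$best_mod_alpha
# demo_res$best_mod_lambda
# demo_res$en_accuracy_stats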
# en_kfold_model_grid<-function(ddata,response_var,iter=10,k=10,num_alpha=20,num_lambda=100,seed=123,fit_met='accuracy',loo=FALSE,up_dn_samp='none',eq_wt=FALSE,type_meas="deviance",na_rm=TRUE,lr_cutoff=c(.5),accuracy_modeling=FALSE){
# set.seed(seed)
#
# enr_grid<-expand.grid(alpha = seq(0,1,by=(1/num_alpha)),
# lambda = (10^seq(-3, 3, length = num_lambda)),
# cutoff = lr_cutoff)
#
# #print(enr_grid)
#
# if(up_dn_samp=='upsamp'){
# ddata<-upSample(ddata,as.factor(ddata[,response_var]))
# ddata<-ddata[,-ncol(ddata)]
# }
# if(up_dn_samp=='dwnsamp'){
# ddata<-downSample(ddata,as.factor(ddata[,response_var]))
# ddata<-ddata[,-ncol(ddata)]
# }
#
# print(paste("there are ",nrow(ddata)," cases in the training dataset"))
# acc_data<-ddata
# itern<-1
# f <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
#
# if(loo==TRUE){
# k<-nrow(ddata)
# iter<-1
# }
#
# pbar <- create_progress_bar('text')
# pbar$init(iter*k*nrow(enr_grid))
#
# list.of.fits <- list()
# results <- data.frame()
# loo_tracker<-vector()
#
# print("Running k fold cross validation to find optimal alpha value")
#
# for(i in 1:iter){
# if(loo==FALSE){
# ddata<-ddata[order(runif(nrow(ddata))),]
# }
# ddata$grp<-rep(1:k,length.out=nrow(ddata))
#
# for(j in 1:k)
# {
# itername<-paste("iter_",i,"_fold_",j,sep = "")
#
# if(loo==TRUE){
# train<-ddata[-j,-ncol(ddata)]
# test<-ddata[j,-ncol(ddata)]
# } else {
# train<-ddata[ddata$grp!=j,-ncol(ddata)]
# test<-ddata[ddata$grp==j,-ncol(ddata)]
# }
# #print(mean(train[,1]))
# #print(mean(test[,1]))
# wt<-NULL
# if(eq_wt==TRUE){
# wt<-ifelse(train[,1]==0,
# 1-(sum(train[,1]==0)/nrow(train)),
# 1-(sum(train[,1]==1)/nrow(train)))
# }
#
# train_pred <- model.matrix(f,train)[,-1]
# train_resp <- train[,response_var]
#
# #print(head(train_pred))
# #print(train_resp)
#
# test_pred <- model.matrix(f,test)[,-1]
# test_resp <- test[,response_var]
#
# #print(head(test_pred))
# #print(test_resp)
#
# for(alphaa in 1:nrow(enr_grid)) {
# #print(runif(1))
# fit.name<-paste("iter_",i,"_fold_",j,"_alpha_",(alphaa/num_alpha),sep = "")
#
# mod_alpha<-enr_grid[alphaa,"alpha"]
# mod_lambda<-enr_grid[alphaa,"lambda"]
# mod_cutoff<-enr_grid[alphaa,"cutoff"]
#
# ## Now fit a model (i.e. optimize lambda) and store it in a list that
# ## uses the variable name we just created as the reference.
# #print(head(wt))
# list.of.fits[[fit.name]] <- glmnet(train_pred,
# train_resp,
# weights = wt,
# #type.measure = type_meas,
# alpha=mod_alpha,
# lambda=mod_lambda,
# family="binomial")
#
# #print(list.of.fits[[fit.name]])
#
# # predicted <- cton(predict(list.of.fits[[fit.name]],
# # s=list.of.fits[[fit.name]]$lambda.1se,type="class", newx=test_pred))
#
# predicted_prob <- cton(predict(list.of.fits[[fit.name]],
# s=mod_lambda,type="response", newx=test_pred))
#
# test_resp<-cton(test_resp)
# #print(paste("predddd",mean(predicted)))
#
# for(l_cut in 1:length(lr_cutoff)){
# lr_c<-lr_cutoff[l_cut]
# #print(lr_c)
#
# predicted <- cton(ifelse(predicted_prob>lr_c,1,0))
#
# ## Calculate the fit...CHANGED THIS TO LOG LOSS BASED ON INFORMATION FOUND HERE https://towardsdatascience.com/why-not-mse-as-a-loss-function-for-logistic-regression-589816b5e03c
# if(fit_met=="accuracy"){
# fit <- MLmetrics::Accuracy(y_pred = predicted,
# y_true = test_resp)
# } else if(fit_met=="auroc"){
# fit <- MLmetrics::AUC(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")))
# } else if(fit_met=="logloss"){
# fit <- MLmetrics::LogLoss(y_pred = predicted_prob,
# y_true = test_resp)
# } else if(fit_met=="f1"){
# fit <- MLmetrics::F1_Score(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="ppv"){
# fit <- MLmetrics::Precision(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="npv"){
# fit <- caret::negPredValue(factor(test_resp,levels = c("1","0")),
# factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="sens"){
# fit <- MLmetrics::Sensitivity(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="spec"){
# fit <- MLmetrics::Specificity(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="bal_acc"){
# fit <- yardstick::bal_accuracy_vec(truth = factor(test_resp,levels = c("1","0")),
# estimate = factor(predicted,levels = c("1","0")))
# } else {
# fit <- mean((test_resp - predicted)^2)
# }
#
# #print(lr_c)
#
# ## Store the results
# temp <- data.frame(alpha=mod_alpha, lambda=mod_lambda, class_cutoff = lr_c, fit=fit, fit.name=fit.name)
# results <- rbind(results, temp)
# #print(temp)
# itern<-itern+1
#
# pbar$step()
#
# }
#
#
# }
# }
# }
#
# print(paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""))
#
# agg_results<-aggregate(list(fit=results$fit),by=list(alpha=results$alpha, lambda=results$lambda, class_cutoff=results$class_cutoff),mean,na.action=na_rm)
# agg_results_select<-agg_results[!is.na(agg_results$fit),]
#
# #print(names(agg_results_select))
# #print("look here")
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
# #print(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"class_cutoff"])
#
# print(paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""))
#
# #besten_mod_name<-as.character(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"fit.name"])
# if(fit_met=="accuracy" | fit_met=="auroc" | fit_met=="f1" | fit_met=="ppv" | fit_met=="npv" | fit_met=="sens" | fit_met=="spec" | fit_met=="bal_acc"){
# best_mod_alpha<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"alpha"]
# best_mod_lambda<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"lambda"]
# best_mod_cutoff<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),]$class_cutoff
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),])
# } else {
# best_mod_alpha<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"alpha"]
# best_mod_lambda<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"lambda"]
# best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),]$class_cutoff
# #print(best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),])
# }
#
# #print(agg_results)
# print("best alphas")
# print(best_mod_alpha)
# print("best lambdas")
# print(best_mod_lambda)
# print("best cutoffs")
# print(best_mod_cutoff)
#
#
# x <- model.matrix(f,ddata)[,-1]
# y <- ddata[,response_var]
#
# if((length(best_mod_alpha)>1)&!accuracy_modeling){
# print("This will take a litte while longer becase we need to examine performance metrics as there was a tie for best alpha")
# best_mod_alpha_res<-en_kfold_accuracy_tied(tied_alphas=best_mod_alpha,
# tied_lambdas=best_mod_lambda,
# tied_cutoffs=best_mod_cutoff,
# ddata = acc_data,
# response_var = response_var,
# eq_wt = eq_wt,
# type_meas = type_meas)
# best_mod_alpha<-best_mod_alpha_res$best_alpha_value
# best_mod_lambda<-best_mod_alpha_res$best_lambda_value
# best_mod_cutoff<-best_mod_alpha_res$best_cutoff
# } else {
# best_mod_alpha_res<-"there was no need to use performance measures"
# }
#
# best_model<-glmnet(x,
# y,
# alpha=best_mod_alpha,
# lambda=best_mod_lambda,
# family="binomial")
#
# if(eq_wt==TRUE){
# print(y)
# final_weights<-ifelse(y==0,
# 1-(sum(y==0)/length(y)),
# 1-(sum(y==1)/length(y)))
#
# #print(final_weights)
#
# best_model<-glmnet(x,
# y,
# alpha=best_mod_alpha,
# lambda=best_mod_lambda,
# weights = final_weights,
# family = "binomial")
# }
#
# print("Calculating model performance")
#
# if(loo==TRUE){
# en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
# response_var = response_var,
# mod_alpha = best_mod_alpha,
# mod_lambda = best_mod_lambda,
# loo = TRUE,
# type_meas = type_meas,
# lr_cutoff = best_mod_cutoff)
# } else {
# en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
# response_var = response_var,
# mod_alpha = best_mod_alpha,
# mod_lambda = best_mod_lambda,
# iter = iter,
# k = k,
# type_meas = type_meas,
# lr_cutoff = best_mod_cutoff)
# }
#
# # print(model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))])
# # print(predict(best_model,
# # newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# # type="class"))
# # print(ddata[,response_var])
#
# best_model_internal_pred_probs<-predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="response")
#
# best_model_internal_conf_mat<-caret::confusionMatrix(factor(ifelse(predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="response")>best_mod_cutoff,1,0),
# levels = c("1","0")),
# factor(ddata[,response_var],
# levels = c("1","0")))$table
#
# en_accuracy_results<-en_accuracy$kfold_results
# en_accuracy_stats<-rown_to_var(data.frame("Value"=round(en_accuracy$kfold_agg_results,4)),varname = "statistic")
# best_mod_ors_all<-cbind(rown_to_var(as.data.frame(as.matrix(round(exp(coef(best_model)),3)))),
# "variable_dropped"=as.vector(coef(best_model))==0)
# best_mod_betas<-rown_to_var(as.data.frame(as.matrix(round(coef(best_model),3))))
#
# print(paste("best alpha =",best_mod_alpha))
# print(paste("best lambda =",best_mod_lambda))
# print(paste("best cutoff =",best_mod_cutoff))
# print(en_accuracy_stats)
# print(best_model_internal_conf_mat)
#
# return(list("best_en_model"=best_model,
# "best_mod_ors_all"=best_mod_ors_all,
# "best_mod_betas"=best_mod_betas,
# "alpha_agg_results"=agg_results,
# "best_mod_alpha"=best_mod_alpha,
# "best_mod_lambda"=best_mod_lambda,
# "best_mod_cutoff"=best_mod_cutoff,
# "best_model_internal_conf_mat"=best_model_internal_conf_mat,
# "best_model_internal_pred_probs"=best_model_internal_pred_probs,
# "best_mod_alpha_res"=best_mod_alpha_res,
# "en_accuracy_stats"=en_accuracy_stats,
# "en_accuracy_results"=en_accuracy_results,
# #"results"=results,
# "nas_removed_from_results"=paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""),
# "nas_removed_from_agg_results"=paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""),
# "list.of.fits"=list.of.fits))
#
# }
# en_kfold_model_grid_old<-function(ddata,response_var,iter=10,k=10,num_alpha=20,num_lambda=100,seed=123,fit_met='accuracy',loo=FALSE,up_dn_samp='none',eq_wt=FALSE,type_meas="deviance",na_rm=TRUE,lr_cutoff=c(.5),accuracy_modeling=FALSE,save_results = FALSE){
# set.seed(seed)
#
# enr_grid<-expand.grid(alpha = seq(0,1,by=(1/num_alpha)),
# lambda = (10^seq(-3, 3, length = num_lambda)),
# cutoff = lr_cutoff)
#
# #print(enr_grid)
#
# if(up_dn_samp=='upsamp'){
# ddata<-upSample(ddata,as.factor(ddata[,response_var]))
# ddata<-ddata[,-ncol(ddata)]
# }
# if(up_dn_samp=='dwnsamp'){
# ddata<-downSample(ddata,as.factor(ddata[,response_var]))
# ddata<-ddata[,-ncol(ddata)]
# }
#
# print(paste("there are ",nrow(ddata)," cases in the training dataset"))
# acc_data<-ddata
# itern<-1
# f <- as.formula(paste(response_var, "~", paste(names(ddata[,!names(ddata) %in% c(response_var)]), collapse=" + ")))
#
# if(loo==TRUE){
# k<-nrow(ddata)
# iter<-1
# }
#
# pbar <- create_progress_bar('text')
# pbar$init(iter*k*nrow(enr_grid))
#
# list.of.fits <- list()
# results <- data.frame()
# loo_tracker<-vector()
# agg_results<-data.frame()
#
# print("Running k fold cross validation to find optimal alpha value")
#
# for(i in 1:iter){
#
# results<-data.frame()
#
# if(loo==FALSE){
# ddata<-ddata[order(runif(nrow(ddata))),]
# }
# ddata$grp<-rep(1:k,length.out=nrow(ddata))
#
# for(j in 1:k)
# {
# itername<-paste("iter_",i,"_fold_",j,sep = "")
#
# if(loo==TRUE){
# train<-ddata[-j,-ncol(ddata)]
# test<-ddata[j,-ncol(ddata)]
# } else {
# train<-ddata[ddata$grp!=j,-ncol(ddata)]
# test<-ddata[ddata$grp==j,-ncol(ddata)]
# }
# #print(mean(train[,1]))
# #print(mean(test[,1]))
# wt<-NULL
# if(eq_wt==TRUE){
# wt<-ifelse(train[,1]==0,
# 1-(sum(train[,1]==0)/nrow(train)),
# 1-(sum(train[,1]==1)/nrow(train)))
# }
#
# train_pred <- model.matrix(f,train)[,-1]
# train_resp <- train[,response_var]
#
# #print(head(train_pred))
# #print(train_resp)
#
# test_pred <- model.matrix(f,test)[,-1]
# test_resp <- test[,response_var]
#
# #print(head(test_pred))
# #print(test_resp)
#
# for(alphaa in 1:nrow(enr_grid)) {
# #print(runif(1))
# fit.name<-paste("iter_",i,"_fold_",j,"_alpha_",(alphaa/num_alpha),sep = "")
# if(alphaa==1){
# print(fit.name)
# }
#
# mod_alpha<-enr_grid[alphaa,"alpha"]
# mod_lambda<-enr_grid[alphaa,"lambda"]
# mod_cutoff<-enr_grid[alphaa,"cutoff"]
#
# ## Now fit a model (i.e. optimize lambda) and store it in a list that
# ## uses the variable name we just created as the reference.
# #print(head(wt))
# if(save_results){
# list.of.fits[[fit.name]] <- glmnet(train_pred,
# train_resp,
# weights = wt,
# #type.measure = type_meas,
# alpha=mod_alpha,
# lambda=mod_lambda,
# family="binomial")
#
# predicted_prob <- cton(predict(list.of.fits[[fit.name]],
# s=mod_lambda,type="response", newx=test_pred))
#
#
# } else {
# if(alphaa==1&k==1&iter==1){
# print(paste("lim =",save_results))
# }
#
# model_fit <- glmnet(train_pred,
# train_resp,
# weights = wt,
# #type.measure = type_meas,
# alpha=mod_alpha,
# lambda=mod_lambda,
# family="binomial")
#
# predicted_prob <- cton(predict(model_fit,
# s=mod_lambda,type="response", newx=test_pred))
# }
#
#
# test_resp<-cton(test_resp)
# #print(paste("predddd",mean(predicted)))
#
# for(l_cut in 1:length(lr_cutoff)){
# lr_c<-lr_cutoff[l_cut]
# #print(lr_c)
#
# predicted <- cton(ifelse(predicted_prob>lr_c,1,0))
#
# ## Calculate the fit...CHANGED THIS TO LOG LOSS BASED ON INFORMATION FOUND HERE https://towardsdatascience.com/why-not-mse-as-a-loss-function-for-logistic-regression-589816b5e03c
# if(fit_met=="accuracy"){
# fit <- MLmetrics::Accuracy(y_pred = predicted,
# y_true = test_resp)
# } else if(fit_met=="auroc"){
# fit <- MLmetrics::AUC(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")))
# } else if(fit_met=="logloss"){
# fit <- MLmetrics::LogLoss(y_pred = predicted_prob,
# y_true = test_resp)
# } else if(fit_met=="f1"){
# fit <- MLmetrics::F1_Score(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="ppv"){
# fit <- MLmetrics::Precision(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="npv"){
# fit <- caret::negPredValue(factor(test_resp,levels = c("1","0")),
# factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="sens"){
# fit <- MLmetrics::Sensitivity(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="spec"){
# fit <- MLmetrics::Specificity(y_true = factor(test_resp,levels = c("1","0")),
# y_pred = factor(predicted,levels = c("1","0")),positive = "1")
# } else if(fit_met=="bal_acc"){
# fit <- yardstick::bal_accuracy_vec(truth = factor(test_resp,levels = c("1","0")),
# estimate = factor(predicted,levels = c("1","0")))
# } else {
# fit <- mean((test_resp - predicted)^2)
# }
#
# #print(lr_c)
#
# ## Store the results
# temp <- data.frame(alpha=mod_alpha, lambda=mod_lambda, class_cutoff = lr_c, fit=fit, fit.name=fit.name)
# results <- rbind(results, temp)
# #print(temp)
# itern<-itern+1
#
# pbar$step()
#
# }
#
# }
#
# }
#
# agg_results<-rbind(agg_results,
# aggregate(list(fit=results$fit),by=list(alpha=results$alpha, lambda=results$lambda, class_cutoff=results$class_cutoff),mean,na.action=na_rm))
#
# }
#
# print(paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""))
#
# agg_results_select<-agg_results[!is.na(agg_results$fit),]
#
# #print(names(agg_results_select))
# #print("look here")
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
# #print(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"class_cutoff"])
#
# print(paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = ""))
#
# #besten_mod_name<-as.character(agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"fit.name"])
# if(fit_met=="accuracy" | fit_met=="auroc" | fit_met=="f1" | fit_met=="ppv" | fit_met=="npv" | fit_met=="sens" | fit_met=="spec" | fit_met=="bal_acc"){
# best_mod_alpha<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"alpha"]
# best_mod_lambda<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"lambda"]
# best_mod_cutoff<-agg_results_select[agg_results_select$fit==max(agg_results_select$fit),]$class_cutoff
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),"class_cutoff"])
# #print(agg_results_select[agg_results_select$fit==max(agg_results_select$fit),])
# } else {
# best_mod_alpha<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"alpha"]
# best_mod_lambda<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),"lambda"]
# best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),]$class_cutoff
# #print(best_mod_cutoff<-agg_results_select[agg_results_select$fit==min(agg_results_select$fit),])
# }
#
# #print(agg_results)
# print("best alphas")
# print(best_mod_alpha)
# print("best lambdas")
# print(best_mod_lambda)
# print("best cutoffs")
# print(best_mod_cutoff)
#
#
# x <- model.matrix(f,ddata)[,-1]
# y <- ddata[,response_var]
#
# if((length(best_mod_alpha)>1)&!accuracy_modeling){
# print("This will take a litte while longer becase we need to examine performance metrics as there was a tie for best alpha")
# best_mod_alpha_res<-en_kfold_accuracy_tied(tied_alphas=best_mod_alpha,
# tied_lambdas=best_mod_lambda,
# tied_cutoffs=best_mod_cutoff,
# ddata = acc_data,
# response_var = response_var,
# eq_wt = eq_wt,
# type_meas = type_meas)
# best_mod_alpha<-best_mod_alpha_res$best_alpha_value
# best_mod_lambda<-best_mod_alpha_res$best_lambda_value
# best_mod_cutoff<-best_mod_alpha_res$best_cutoff
# } else {
# best_mod_alpha_res<-"there was no need to use performance measures"
# }
#
# best_model<-glmnet(x,
# y,
# alpha=best_mod_alpha,
# lambda=best_mod_lambda,
# family="binomial")
#
# if(eq_wt==TRUE){
# print(y)
# final_weights<-ifelse(y==0,
# 1-(sum(y==0)/length(y)),
# 1-(sum(y==1)/length(y)))
#
# #print(final_weights)
#
# best_model<-glmnet(x,
# y,
# alpha=best_mod_alpha,
# lambda=best_mod_lambda,
# weights = final_weights,
# family = "binomial")
# }
#
# print("Calculating model performance")
#
# if(loo==TRUE){
# en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
# response_var = response_var,
# mod_alpha = best_mod_alpha,
# mod_lambda = best_mod_lambda,
# loo = TRUE,
# type_meas = type_meas,
# lr_cutoff = best_mod_cutoff)
# } else {
# en_accuracy<-en_kfold_accuracy_grid(ddata = acc_data,
# response_var = response_var,
# mod_alpha = best_mod_alpha,
# mod_lambda = best_mod_lambda,
# iter = iter,
# k = k,
# type_meas = type_meas,
# lr_cutoff = best_mod_cutoff)
# }
#
# # print(model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))])
# # print(predict(best_model,
# # newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# # type="class"))
# # print(ddata[,response_var])
#
# best_model_internal_pred_probs<-predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="response")
#
# best_model_internal_conf_mat<-caret::confusionMatrix(factor(ifelse(predict(best_model,
# newx=model.matrix(formula_maker(response_var,names(ddata)[!names(ddata) %in% c(response_var)]),ddata)[,-c(1,ncol(ddata))],
# type="response")>best_mod_cutoff,1,0),
# levels = c("1","0")),
# factor(ddata[,response_var],
# levels = c("1","0")))$table
#
# en_accuracy_results<-en_accuracy$kfold_results
# en_accuracy_stats<-rown_to_var(data.frame("Value"=round(en_accuracy$kfold_agg_results,4)),varname = "statistic")
# best_mod_ors_all<-cbind(rown_to_var(as.data.frame(as.matrix(round(exp(coef(best_model)),3)))),
# "variable_dropped"=as.vector(coef(best_model))==0)
# best_mod_betas<-rown_to_var(as.data.frame(as.matrix(round(coef(best_model),3))))
#
# print(paste("best alpha =",best_mod_alpha))
# print(paste("best lambda =",best_mod_lambda))
# print(paste("best cutoff =",best_mod_cutoff))
# print(en_accuracy_stats)
# print(best_model_internal_conf_mat)
#
# return(list("best_en_model"=best_model,
# "best_mod_ors_all"=best_mod_ors_all,
# "best_mod_betas"=best_mod_betas,
# "alpha_agg_results"=agg_results,
# "best_mod_alpha"=best_mod_alpha,
# "best_mod_lambda"=best_mod_lambda,
# "best_mod_cutoff"=best_mod_cutoff,
# "best_model_internal_conf_mat"=best_model_internal_conf_mat,
# "best_model_internal_pred_probs"=best_model_internal_pred_probs,
# "best_mod_alpha_res"=best_mod_alpha_res,
# "en_accuracy_stats"=en_accuracy_stats,
# "en_accuracy_results"=en_accuracy_results,
# #"results"=results,
# "nas_removed_from_results"=paste("n(%) of NA fit values ",sum(is.na(results$fit))," (",highlandr::percent((sum(is.na(results$fit))/length(results$fit)),digits=1),")",sep = ""),
# "nas_removed_from_agg_results"=paste("n(%) of NA agg results ",sum(is.na(agg_results$fit))," (",highlandr::percent((sum(is.na(agg_results$fit))/length(agg_results$fit)),digits=1),")",sep = "")#,
# #"list.of.fits"=list.of.fits
# ))
#
# }